Correcting Stations Data and adding latitudes and longitudes.
Sys.Date()
[1] "2019-03-26"
Feel The Station Data
colSums(is.na(stations))
Station_ID Station_Name Go_live_date Region Status
0 3 3 3 3
Find Missing Stations
# IDs of the stations whose Station_Name is missing.
# The two columns are indexed directly instead of sapply()-ing over every
# column and then extracting `$Station_Name`: sapply() simplifies its result to
# a vector/matrix whenever all columns have the same non-zero number of NAs,
# and `$` then fails.
stations$Station_ID[which(is.na(stations$Station_Name))]
[1] 4110 4118 4276
Fixing Missing Stations
# Fill in the stations whose name / go-live date / region / status were
# missing (IDs 4110, 4276, 4118) by overwriting each whole row with
# hand-researched values, then append stations that are absent from the table.
# NOTE(review): the Go_live_date strings below are not written consistently:
# "2017-09-07" vs "2017-9-07" (no zero padding), and "2017-26-03" puts the day
# before the month (the event date is 3/26/2017). Later code parses
# Go_live_date with format "%m/%d/%Y" - confirm which format the stations
# table actually uses and whether these values survive that parse.
i_4110<-which(stations$Station_ID=="4110")
stations[ i_4110,]<- c(Station_ID="4110",
Station_Name="Soul Cycle",
Go_live_date="2017-09-07",
Region="DTLA",
Status="Inactive")
i_4276<-which(stations$Station_ID=="4276")
stations[ i_4276,]<- c(Station_ID="4276",
Station_Name="Mariachi Plaza",
Go_live_date="2017-12-02",
Region="DTLA",
Status="Inactive")
# Source for station 4118:
#https://www.laworks.com/opportunity/a0C1N00000GHHzqUAH
##This one needs to be recoded for all the other files.
i_4118<-which(stations$Station_ID=="4118")
stations[ i_4118,]<- c(Station_ID="4118",
Station_Name="Channing St",
Go_live_date="2017-9-07",
Region="DTLA",
Status="Inactive")
# These two stations were special events for 3/26/2017.
#https://thecabe.com/forum/threads/ciclavia-venice-beach-california-march-26-2017.107254/
station_3009<- c(Station_ID="3009",
Station_Name="Windward and Pacific",
Go_live_date="2017-26-03",
Region="Venice",
Status="Inactive")
# Append the event station as a new row.
stations<- rbind(stations, station_3009)
station_3039<- c(Station_ID="3039",
Station_Name="Culver and Washington",
Go_live_date="2017-26-03",
Region="Venice",
Status="Inactive")
stations<- rbind(stations, station_3039)
# This station is the same as the Olive and 5th station, but it moved so far, it needs a new name.
# NOTE(review): this vector has seven fields, whereas the vectors above have
# five (matching the five columns reported by colSums earlier). Presumably
# rbind() keeps only the first five, so latitude/longitude would not be stored
# here - confirm.
station_9999<- c(Station_ID="9999",
Station_Name="Olive and 6th",
Go_live_date="2016-10-01",
Region="DTLA",
Status="Inactive",
latitude = 34.048038,
longitude = -118.253738)
stations<- rbind(stations, station_9999)
Finding Active/Inactive Stations/Region
#virtual region N/A
table(activeAllStationList$Region)
DTLA N/A Port of LA Venice
68 1 12 14
The loops below try to find the most frequent latitudes and longitudes associated with each station. Some stations have multiple latitudes and longitudes with varying numbers. We’ll do this to standardize the data. Once we find the most frequent latitude and longitude, we’ll write it to the stations file.
bikes<- readxl::read_xlsx("./data/LABikeData.xlsx")
(Progress-spinner output from readxl::read_xlsx omitted.)
# Find the most frequently recorded latitude and longitude for one station.
#
# Every trip that starts at `stationId` contributes its start_lat/start_lon and
# every trip that ends there contributes its end_lat/end_lon. Latitude and
# longitude are tallied independently of each other.
#
# Arguments:
#   stationId - station identifier, compared against start_station/end_station.
#   trips     - trip table with columns start_station, end_station, start_lat,
#               start_lon, end_lat, end_lon. Defaults to the global `bikes`,
#               so existing one-argument calls keep working.
#
# Returns a numeric vector c(latitude, longitude). Each element is:
#   * the most frequent non-missing value (first one seen wins a tie),
#   * NA when the station has trips but every coordinate is missing,
#   * 0 when the station has no trips at all.
getLatLongForStation <- function(stationId, trips = bikes) {
  # Most frequent value of a numeric vector, ignoring NAs. Missing values are
  # dropped before counting: comparing with `==` turns every NA into an extra
  # "match", which inflates all counts and lets NA win as the modal value.
  most_frequent <- function(values) {
    if (length(values) == 0L) {
      return(0)
    }
    observed <- values[!is.na(values)]
    if (length(observed) == 0L) {
      return(NA_real_)
    }
    distinct <- unique(observed)
    counts <- tabulate(match(observed, distinct), nbins = length(distinct))
    distinct[which.max(counts)]
  }

  # which() drops trips whose station id is NA.
  from_here <- which(trips$start_station == stationId)
  to_here <- which(trips$end_station == stationId)

  c(
    most_frequent(c(trips$start_lat[from_here], trips$end_lat[to_here])),
    most_frequent(c(trips$start_lon[from_here], trips$end_lon[to_here]))
  )
}
# Look up the modal coordinates of every station (timed, because it is slow).
# vapply() pins each result to a numeric pair, so vOfLatLong is always a
# 2 x n matrix (row 1 = latitude, row 2 = longitude); sapply() would return a
# list instead when `stations` has no rows.
system.time(
  vOfLatLong <- vapply(stations$Station_ID, getLatLongForStation, numeric(2))
)
user system elapsed
129.645 8.987 138.871
# Attach the modal latitude/longitude of each station to the station table.
cleanStations <- cbind(stations,
                       latitude = unname(vOfLatLong[1, ]),
                       longitude = unname(vOfLatLong[2, ]))
# Station 9999 (Olive and 6th) was added by hand and never occurs in the trip
# data, so the lookup above yields 0/0 for it. Those zeros later get copied
# onto every trip recoded to 9999. Restore the coordinates given where the
# station was defined.
is_9999 <- which(cleanStations$Station_ID == "9999")
cleanStations$latitude[is_9999] <- 34.048038
cleanStations$longitude[is_9999] <- -118.253738
More cleanup: station IDs 4164 and 4217 have no entries in the bike data, so we remove them.
# Drop the two stations that never appear in the trip data, then save the
# cleaned station table.
keep_station <- cleanStations$Station_ID != 4164 &
  cleanStations$Station_ID != 4217
cleanStations <- cleanStations[keep_station, ]
write.csv(cleanStations, "./data/stations_cleaned.csv", row.names = FALSE)
Clean Bike Data
colSums(is.na(bikes))
trip_id bike_id start_station end_station trip_route_category
0 0 0 43198 0
start_time end_time start_lat start_lon end_lat
0 0 1354 1354 9110
end_lon plan_duration passholder_type
9110 384 0
Columns in the bike data with NAs that need fixing: end_station (43198), start_lat (1354), start_lon (1354), end_lat (9110), end_lon (9110), plan_duration (384).
Fix the missing end stations:
# Back-fill missing end stations from the trip's end coordinates.
# For every trip with an NA end_station:
#   * look the end latitude up in the station table;
#   * if the latitude has no match, fall back to the end longitude;
#   * if neither matches, the station stays NA.
# A latitude match on row 1 of the station table is deliberately left alone,
# as in the original loop (presumably row 1 is the virtual station, whose NA
# coordinates would match any NA latitude - confirm).
# Vectorised: the row-by-row loop used 1:length() and an elementwise `|`
# inside if(), and was slow on the full trip table.
missing_end <- which(is.na(bikes$end_station))
by_lat <- match(bikes$end_lat[missing_end], cleanStations$latitude)
by_lon <- match(bikes$end_lon[missing_end], cleanStations$longitude)
skip <- !is.na(by_lat) & by_lat == 1
station_row <- ifelse(is.na(by_lat), by_lon, by_lat)
fill <- missing_end[!skip]
if (length(fill) > 0) {
  bikes$end_station[fill] <- cleanStations$Station_ID[station_row[!skip]]
}
Look at the missing data again to see what we have.
nrow(bikes[which(is.na(bikes$end_station)), ])
[1] 1663
Fill in the missing stations that have missing end lat/lon as virtual stations.
# Any end station that is still missing is assigned to virtual station 3000.
# One vectorised assignment replaces the row-by-row loop (and its 1:length(),
# which misbehaves on an empty table).
bikes$end_station[is.na(bikes$end_station)] <- 3000
Trips whose start latitude and longitude are NA are all mapped to virtual station 3000.
unique(bikes[which(is.na(bikes$start_lat)), ]$start_station)
[1] 3000
unique(bikes[which(is.na(bikes$start_lon)), ]$start_station)
[1] 3000
Likewise, trips whose end latitude and longitude are NA are now all mapped to virtual station 3000.
unique(bikes[which(is.na(bikes$end_lat)), ]$end_station)
[1] "3000"
unique(bikes[which(is.na(bikes$end_lon)), ]$end_station)
[1] "3000"
After looking at the map, locations 4118 and 4108 are the same. Recode 4118 as 4108.
bikes[which(bikes$end_station==4118), "end_station"]<- 4108
passholders and plan duration.
There are 269 trips coded with a plan duration of 150, all of them monthly passes. We’ll recode those as 30.
table(bikes$plan_duration, bikes$passholder_type)
Annual Pass Flex Pass Monthly Pass One Day Pass Walk-up
0 0 0 0 0 132566
1 0 0 0 23319 87262
30 0 0 365449 0 2276
150 0 0 269 0 0
365 2057 25160 1044 0 0
# Recode the 150-day plan durations as 30 days.
# which() drops the NA comparisons, so missing durations are left untouched.
bikes$plan_duration[which(bikes$plan_duration == 150)] <- 30
There are 384 trips with an NA plan duration.
It looks like all of these are monthly passholders, so we’ll set the duration to 30 for these too.
bikes[which(is.na(bikes$plan_duration)), ]
bikes[is.na(bikes$plan_duration), ]$passholder_type
[1] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[8] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[15] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[22] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[29] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[36] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[43] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[50] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[57] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[64] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[71] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[78] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[85] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[92] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[99] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[106] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[113] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[120] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[127] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[134] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[141] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[148] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[155] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[162] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[169] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[176] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[183] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[190] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[197] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[204] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[211] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[218] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[225] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[232] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[239] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[246] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[253] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[260] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[267] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[274] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[281] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[288] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[295] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[302] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[309] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[316] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[323] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[330] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[337] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[344] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[351] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[358] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[365] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[372] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
[379] "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass" "Monthly Pass"
# Monthly-pass trips with a missing plan duration get the 30-day duration.
# which() also skips rows whose passholder_type is NA; the row-by-row loop
# would have stopped with an error on such a row.
monthly_no_plan <- which(
  bikes$passholder_type == "Monthly Pass" & is.na(bikes$plan_duration)
)
bikes$plan_duration[monthly_no_plan] <- 30
#all the plan durations are fixed now
summary(as.factor(bikes$plan_duration))
0 1 30 365
132566 110581 368378 28261
So, lots of walkups buy a full day pass and some even buy a monthly pass. 1044 monthly passes coded as 365 day passes. not sure what those are.
nothing really unique about these all different times and locations.
table(bikes$plan_duration,bikes$passholder_type)
Annual Pass Flex Pass Monthly Pass One Day Pass Walk-up
0 0 0 0 0 132566
1 0 0 0 23319 87262
30 0 0 366102 0 2276
365 2057 25160 1044 0 0
bikes[which(bikes$plan_duration==365 & bikes$passholder_type=="Monthly Pass"), ]
Most of the NA values should now be taken care of; double-check.
summary(bikes)
trip_id bike_id start_station end_station trip_route_category start_time
Min. : 1912818 Length:639786 Min. :3000.000 Length:639786 Length:639786 Length:639786
1st Qu.: 28656588 Class :character 1st Qu.:3029.000 Class :character Class :character Class :character
Median : 63803192 Mode :character Median :3052.000 Mode :character Mode :character Mode :character
Mean : 61519730 Mean :3300.809
3rd Qu.: 96710610 3rd Qu.:3082.000
Max. :112732252 Max. :4276.000
end_time start_lat start_lon end_lat end_lon
Length:639786 Min. : 0.00000 Min. :-118.4913 Min. : 0.00000 Min. :-118.4913
Class :character 1st Qu.:34.04113 1st Qu.:-118.2612 1st Qu.:34.04060 1st Qu.:-118.2609
Mode :character Median :34.04681 Median :-118.2524 Median :34.04661 Median :-118.2528
Mean :34.04127 Mean :-118.2645 Mean :34.04022 Mean :-118.2619
3rd Qu.:34.05110 3rd Qu.:-118.2410 3rd Qu.:34.05088 3rd Qu.:-118.2388
Max. :34.16529 Max. : 118.2383 Max. :34.16529 Max. : 0.0000
NA's :1354 NA's :1354 NA's :9110 NA's :9110
plan_duration passholder_type
Min. : 0.00000 Length:639786
1st Qu.: 1.00000 Class :character
Median : 30.00000 Mode :character
Mean : 33.56933
3rd Qu.: 30.00000
Max. :365.00000
now we need to fix outliers in the data.
Starting to look for outliers. We’ll do the obvious outliers first, the locations with 0 lat/lon, or positive lon
temp1<- bikes[which(bikes$start_lon>0),]
temp1
cleanStations[which(cleanStations$Station_ID=="3039"), ]
#this one is coded as stn 3039 but we know that one was only active for an event. go by the start lat/lon
match(temp1$start_lon, cleanStations$longitude)
[1] NA
match(temp1$start_lat, cleanStations$latitude)
[1] NA
#they don't match anything in our station data.
# the map indicates channing street and the LA warehouse stations 4108 and 4118
# code this one as 4108 1 la warehouse
# Recode the trip(s) with a positive start longitude as station 4108 and give
# them the corrected (negative) longitude.
# The two columns are assigned separately: a single length-2 value spread
# across two columns is only correct when exactly one row is selected.
flipped_lon <- which(bikes$start_lon > 0)
bikes$start_station[flipped_lon] <- 4108
bikes$start_lon[flipped_lon] <- -118.238258
#Need to just combine 1 LA WHSE and Channing ST.
bikes[which(bikes$end_station=="4118"), "end_station" ] <- 4108
bikes[which(bikes$start_station=="4108"),]
Now the zeros
bikes_temp <- bikes #save this off in case i screw something up.
# start location
temp<- bikes[which(bikes$start_lon==0 | bikes$start_lat==0),]
index<- match( temp$start_station, cleanStations$Station_ID) #67
temp
index # these are all the la warehouse
[1] 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67
# Replace zero start coordinates with the coordinates of the trip's own start
# station from the station table.
# The affected rows are computed once: re-testing the condition after the
# longitude has been overwritten would miss rows whose latitude was non-zero.
# The station row is looked up per trip rather than hard-coding row 67 (every
# affected trip matched row 67 in the check above).
zero_start <- which(bikes$start_lon == 0 | bikes$start_lat == 0)
zero_start_row <- match(bikes$start_station[zero_start], cleanStations$Station_ID)
bikes$start_lon[zero_start] <- cleanStations$longitude[zero_start_row]
bikes$start_lat[zero_start] <- cleanStations$latitude[zero_start_row]
#end location
temp<- bikes[which(bikes$end_lon==0 | bikes$end_lat==0),]
match( temp$end_station, cleanStations$Station_ID)# all 67 again
[1] 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67 67
[38] 67 67 67 67 67 67 67 67 67 67 67
# Replace zero end coordinates with the coordinates of the trip's own end
# station from the station table.
# The affected rows are computed once: re-testing the condition after the
# longitude has been overwritten would miss rows whose latitude was non-zero.
# The station row is looked up per trip rather than hard-coding row 67 (every
# affected trip matched row 67 in the check above).
zero_end <- which(bikes$end_lon == 0 | bikes$end_lat == 0)
zero_end_row <- match(bikes$end_station[zero_end], cleanStations$Station_ID)
bikes$end_lon[zero_end] <- cleanStations$longitude[zero_end_row]
bikes$end_lat[zero_end] <- cleanStations$latitude[zero_end_row]
Now that the locations appear to be fixed, let’s look at the summary again
summary(bikes)
trip_id bike_id start_station end_station trip_route_category start_time
Min. : 1912818 Length:639786 Min. :3000.000 Length:639786 Length:639786 Length:639786
1st Qu.: 28656588 Class :character 1st Qu.:3029.000 Class :character Class :character Class :character
Median : 63803192 Mode :character Median :3052.000 Mode :character Mode :character Mode :character
Mean : 61519730 Mean :3300.811
3rd Qu.: 96710610 3rd Qu.:3082.000
Max. :112732252 Max. :4276.000
end_time start_lat start_lon end_lat end_lon
Length:639786 Min. :33.71098 Min. :-118.4913 Min. :33.71098 Min. :-118.4913
Class :character 1st Qu.:34.04113 1st Qu.:-118.2612 1st Qu.:34.04060 1st Qu.:-118.2609
Mode :character Median :34.04681 Median :-118.2524 Median :34.04661 Median :-118.2528
Mean :34.04298 Mean :-118.2708 Mean :34.04281 Mean :-118.2709
3rd Qu.:34.05110 3rd Qu.:-118.2410 3rd Qu.:34.05088 3rd Qu.:-118.2388
Max. :34.16529 Max. :-118.1165 Max. :34.16529 Max. :-118.1165
NA's :1354 NA's :1354 NA's :9110 NA's :9110
plan_duration passholder_type
Min. : 0.00000 Length:639786
1st Qu.: 1.00000 Class :character
Median : 30.00000 Mode :character
Mean : 33.56933
3rd Qu.: 30.00000
Max. :365.00000
Joining data frames
sum(is.na(bikes$start_time))
[1] 0
start_stations<- cleanStations
end_stations<- cleanStations
colnames(start_stations)<- c("start_station", "start_station_Name",
"start_Go_live_date",
"start_Region", "start_Status", "start_latitude",
"start_longitude")
colnames(end_stations)<- c("end_station", "end_station_Name", "end_Go_live_date",
"end_Region", "end_Status", "end_latitude",
"end_longitude")
colnames(bikes)
[1] "trip_id" "bike_id" "start_station" "end_station" "trip_route_category"
[6] "start_time" "end_time" "start_lat" "start_lon" "end_lat"
[11] "end_lon" "plan_duration" "passholder_type"
Joining the stations data table to the bikes data table. Coding the start and end locations.
# Left-join the station attributes onto each trip twice: once for the start
# station and once for the end station. The join keys are spelled out so the
# merge cannot silently pick up an extra shared column name as a key.
# all.x = TRUE keeps trips whose station has no entry in the station table.
df_1 <- merge(bikes, start_stations,
              by = "start_station", all.x = TRUE, sort = FALSE)
bikes_full <- merge(df_1, end_stations,
                    by = "end_station", all.x = TRUE, sort = FALSE)
Revisit the column types: end_station being a character column doesn’t make sense to me. Do we need to fix it?
summary(bikes_full)
end_station start_station trip_id bike_id trip_route_category start_time
Length:639786 Min. :3000.000 Min. : 1912818 Length:639786 Length:639786 Length:639786
Class :character 1st Qu.:3029.000 1st Qu.: 28656588 Class :character Class :character Class :character
Mode :character Median :3052.000 Median : 63803192 Mode :character Mode :character Mode :character
Mean :3300.811 Mean : 61519730
3rd Qu.:3082.000 3rd Qu.: 96710610
Max. :4276.000 Max. :112732252
end_time start_lat start_lon end_lat end_lon
Length:639786 Min. :33.71098 Min. :-118.4913 Min. :33.71098 Min. :-118.4913
Class :character 1st Qu.:34.04113 1st Qu.:-118.2612 1st Qu.:34.04060 1st Qu.:-118.2609
Mode :character Median :34.04681 Median :-118.2524 Median :34.04661 Median :-118.2528
Mean :34.04298 Mean :-118.2708 Mean :34.04281 Mean :-118.2709
3rd Qu.:34.05110 3rd Qu.:-118.2410 3rd Qu.:34.05088 3rd Qu.:-118.2388
Max. :34.16529 Max. :-118.1165 Max. :34.16529 Max. :-118.1165
NA's :1354 NA's :1354 NA's :9110 NA's :9110
plan_duration passholder_type start_station_Name start_Go_live_date start_Region start_Status
Min. : 0.00000 Length:639786 Length:639786 Length:639786 Length:639786 Length:639786
1st Qu.: 1.00000 Class :character Class :character Class :character Class :character Class :character
Median : 30.00000 Mode :character Mode :character Mode :character Mode :character Mode :character
Mean : 33.56933
3rd Qu.: 30.00000
Max. :365.00000
start_latitude start_longitude end_station_Name end_Go_live_date end_Region end_Status
Min. :33.71098 Min. :-118.4913 Length:639786 Length:639786 Length:639786 Length:639786
1st Qu.:34.04113 1st Qu.:-118.2612 Class :character Class :character Class :character Class :character
Median :34.04681 Median :-118.2524 Mode :character Mode :character Mode :character Mode :character
Mean :34.04299 Mean :-118.2708
3rd Qu.:34.05110 3rd Qu.:-118.2410
Max. :34.16529 Max. :-118.1165
NA's :1354 NA's :1354
end_latitude end_longitude
Min. :33.71098 Min. :-118.4913
1st Qu.:34.04060 1st Qu.:-118.2612
Median :34.04661 Median :-118.2528
Mean :34.04281 Mean :-118.2710
3rd Qu.:34.05088 3rd Qu.:-118.2388
Max. :34.16529 Max. :-118.1165
NA's :10287 NA's :10287
Let’s explore the starting lat and the one that came from the station table (most frequent location). I’ll just use a cartesian distance because we don’t need too much accuracy.
# Difference between each trip's recorded coordinates and the station-table
# coordinates, converted from degrees to feet (the plots below label the axes
# in ft). Presumably the factor is 60 nautical miles per degree * 1.15077945
# statute miles per nautical mile * 5280 feet per mile.
# NOTE(review): the same factor is applied to longitude with no cos(latitude)
# scaling, so east-west distances are overstated (about 20% near 34 N) -
# confirm this is acceptable for the distance thresholds used below.
start_lat_diff<- (bikes_full$start_lat - bikes_full$start_latitude)*1.15077945*60*5280
start_lon_diff<- (bikes_full$start_lon - bikes_full$start_longitude)*1.15077945*60*5280
end_lat_diff<- (bikes_full$end_lat - bikes_full$end_latitude)*1.15077945*60*5280
end_lon_diff<- (bikes_full$end_lon - bikes_full$end_longitude)*1.15077945*60*5280
plot(start_lon_diff, start_lat_diff, main="Difference in Start Position Most Frequent and Coded", xlab="Longitude Difference (ft)",ylab= "Latitude Diff (ft)",ylim=c(-500,500), xlim=c(-400,400))
grid()
plot(end_lon_diff, end_lat_diff, main="Difference in End Position Most Frequent and Coded", xlab="Longitude Difference (ft)",ylab= "Latitude Diff (ft)", ylim=c(-500,500), xlim=c(-400,400))
grid()
bikes_full[which(start_lat_diff< -300),] #4104 obs
bikes_full[which(end_lat_diff< -300),] #3787 obs
cleanStations[which(cleanStations$Station_ID=="3063"), ]
#looks like that's station 3063 at Pershing Square. That's the station 9999 that I added for 6th and olive.
cleanStations[which(cleanStations$Station_ID=="9999"), ]
#they mustve moved where the station 3063 was. I'm going to code these as olive and 6th since it's pretty far away from olive and 5th.
# Recode every trip whose start (or end) latitude lies more than 300 ft below
# its station's latitude as station 9999, copying that station's attributes
# from the station table onto the trip.
temp1 <- cleanStations[which(cleanStations$Station_ID == "9999"),
                       c("latitude", "longitude", "Go_live_date",
                         "Status", "Station_Name")]
moved_start <- which(start_lat_diff < -300)
moved_end <- which(end_lat_diff < -300)
# Trip-table column suffixes, in the same order as the columns of temp1.
trip_suffix <- c("latitude", "longitude", "Go_live_date",
                 "Status", "station_Name")

bikes_full[moved_start, "start_station"] <- "9999"
for (k in seq_along(trip_suffix)) {
  bikes_full[moved_start, paste0("start_", trip_suffix[k])] <- temp1[k]
}

bikes_full[moved_end, "end_station"] <- "9999"
for (k in seq_along(trip_suffix)) {
  bikes_full[moved_end, paste0("end_", trip_suffix[k])] <- temp1[k]
}
# Point the station-table coordinates of a relocated station at the position
# actually recorded on its off-position trips. Replaces six copy-pasted
# stanzas that differed only in station id and threshold.
#   trips      - the joined trip/station data frame
#   station_id - station whose trips are being corrected
#   start_off  - logical vector, one per trip: start position is off
#   end_off    - logical vector, one per trip: end position is off
# Returns `trips` with start_latitude/start_longitude (selected start rows) and
# end_latitude/end_longitude (selected end rows) overwritten by the recorded
# start_lat/start_lon of the first selected start trip (NA if there is none).
recode_moved_station <- function(trips, station_id, start_off, end_off) {
  start_rows <- which(start_off & trips$start_station == station_id)
  end_rows <- which(end_off & trips$end_station == station_id)
  seen_lat <- trips$start_lat[start_rows][1]
  seen_lon <- trips$start_lon[start_rows][1]
  trips[start_rows, "start_latitude"] <- seen_lat
  trips[start_rows, "start_longitude"] <- seen_lon
  trips[end_rows, "end_latitude"] <- seen_lat
  trips[end_rows, "end_longitude"] <- seen_lon
  trips
}

bikes_full[which(start_lon_diff > 200), ] #87 obs
bikes_full[which(end_lon_diff > 200), ] #80 obs
# 3 stations Grand/LATTC, 7th & Westminster, Pasadena Central Library
# 4227, 4213, 4148
bikes_full[which(start_lon_diff > 200 & bikes_full$start_station == "4227"), ] #37 here
# we'll code these as the starting locations in the station table and keep the original long/lat
bikes_full <- recode_moved_station(bikes_full, "4227",
                                   start_lon_diff > 200, end_lon_diff > 200)
bikes_full[which(start_lon_diff > 200), ] #50 left
bikes_full[which(start_lon_diff > 200 & bikes_full$start_station == "4213"), ] #48 here
bikes_full[which(end_lon_diff > 200 & bikes_full$end_station == "4213"), ] #40 here
#looks like they moved 7th and westminster down the street.
# we'll keep the lats/lons
bikes_full <- recode_moved_station(bikes_full, "4213",
                                   start_lon_diff > 200, end_lon_diff > 200)
bikes_full[which(start_lon_diff > 200), ] #2 left
bikes_full[which(end_lon_diff > 200), ] #2 left
#pasadena library looks like they put the station in a different start and these 2 are before the live start date.
bikes_full <- recode_moved_station(bikes_full, "4148",
                                   start_lon_diff > 200, end_lon_diff > 200)
bikes_full[which(start_lon_diff > 100), ] #1364
bikes_full[which(end_lon_diff > 100), ] #1041
#station 3046 2nd & Hill, looks like they had it around the corner for awhile.
bikes_full <- recode_moved_station(bikes_full, "3046",
                                   start_lon_diff > 100, end_lon_diff > 100)
bikes_full[which(start_lon_diff < -100), ] #3157
bikes_full[which(end_lon_diff < -100), ] #4039
#looks like stn 3005 7th and flower got moved around the corner.
bikes_full <- recode_moved_station(bikes_full, "3005",
                                   start_lon_diff < -100, end_lon_diff < -100)
# also theres station 4146 city hall west in pasadena 3 observations. looks like it got moved up the st. these trips were before the start date.
bikes_full <- recode_moved_station(bikes_full, "4146",
                                   start_lon_diff < -100, end_lon_diff < -100)
start_lat_diff<- (bikes_full$start_lat - bikes_full$start_latitude)*1.15077945*60*5280
start_lon_diff<- (bikes_full$start_lon - bikes_full$start_longitude)*1.15077945*60*5280
end_lat_diff<- (bikes_full$end_lat - bikes_full$end_latitude)*1.15077945*60*5280
end_lon_diff<- (bikes_full$end_lon - bikes_full$end_longitude)*1.15077945*60*5280
plot(start_lon_diff, start_lat_diff, main="Difference in Start Position Most Frequent and Coded", xlab="Longitude Difference (ft)",ylab= "Latitude Diff (ft)",ylim=c(-500,500), xlim=c(-400,400))
grid()
plot(end_lon_diff, end_lat_diff, main="Difference in End Position Most Frequent and Coded", xlab="Longitude Difference (ft)",ylab= "Latitude Diff (ft)", ylim=c(-500,500), xlim=c(-400,400))
grid()
revisit 66
bikes_full[which(end_lat_diff> 20),] #28
#This is the LA Warehouse location. 80ft off. we'll leave these be. there's no indication where this station really is. it's in a sketchy part of town and the lat/lon indicated it's behind a fence with concertina wire.
summary(bikes_full)
end_station start_station trip_id bike_id trip_route_category start_time
Length:639786 Length:639786 Min. : 1912818 Length:639786 Length:639786 Length:639786
Class :character Class :character 1st Qu.: 28656588 Class :character Class :character Class :character
Mode :character Mode :character Median : 63803192 Mode :character Mode :character Mode :character
Mean : 61519730
3rd Qu.: 96710610
Max. :112732252
end_time start_lat start_lon end_lat end_lon
Length:639786 Min. :33.71098 Min. :-118.4913 Min. :33.71098 Min. :-118.4913
Class :character 1st Qu.:34.04113 1st Qu.:-118.2612 1st Qu.:34.04060 1st Qu.:-118.2609
Mode :character Median :34.04681 Median :-118.2524 Median :34.04661 Median :-118.2528
Mean :34.04298 Mean :-118.2708 Mean :34.04281 Mean :-118.2709
3rd Qu.:34.05110 3rd Qu.:-118.2410 3rd Qu.:34.05088 3rd Qu.:-118.2388
Max. :34.16529 Max. :-118.1165 Max. :34.16529 Max. :-118.1165
NA's :1354 NA's :1354 NA's :9110 NA's :9110
plan_duration passholder_type start_station_Name start_Go_live_date start_Region start_Status
Min. : 0.00000 Length:639786 Length:639786 Length:639786 Length:639786 Length:639786
1st Qu.: 1.00000 Class :character Class :character Class :character Class :character Class :character
Median : 30.00000 Mode :character Mode :character Mode :character Mode :character Mode :character
Mean : 33.56933
3rd Qu.: 30.00000
Max. :365.00000
start_latitude start_longitude end_station_Name end_Go_live_date end_Region end_Status
Min. : 0.00000 Min. :-118.4913 Length:639786 Length:639786 Length:639786 Length:639786
1st Qu.:34.04099 1st Qu.:-118.2612 Class :character Class :character Class :character Class :character
Median :34.04681 Median :-118.2524 Mode :character Mode :character Mode :character Mode :character
Mean :33.82411 Mean :-117.5106
3rd Qu.:34.05110 3rd Qu.:-118.2410
Max. :34.16529 Max. : 0.0000
NA's :1354 NA's :1354
end_latitude end_longitude
Min. : 0.00000 Min. :-118.4913
1st Qu.:34.04060 1st Qu.:-118.2612
Median :34.04652 Median :-118.2528
Mean :33.83797 Mean :-117.5596
3rd Qu.:34.05088 3rd Qu.:-118.2388
Max. :34.16529 Max. : 0.0000
NA's :10287 NA's :10287
Station 9999 is the same station as 5th and olive, but they moved it across the pershing square thing to 6th and olive.
Also, we’ll need to look at the live date and the trip date. There are some trips before the live date that are probably simulations or test runs. This has an effect on some of the locations.
Rearrange the bikes full data
colnames(bikes_full)
[1] "end_station" "start_station" "trip_id" "bike_id" "trip_route_category"
[6] "start_time" "end_time" "start_lat" "start_lon" "end_lat"
[11] "end_lon" "plan_duration" "passholder_type" "start_station_Name" "start_Go_live_date"
[16] "start_Region" "start_Status" "start_latitude" "start_longitude" "end_station_Name"
[21] "end_Go_live_date" "end_Region" "end_Status" "end_latitude" "end_longitude"
bikes_full_ordered<- bikes_full[ , c("trip_id", "bike_id", "trip_route_category",
"start_station", "end_station",
"start_station_Name", "end_station_Name",
"start_lat", "start_lon",
"end_lat", "end_lon",
"start_Region", "end_Region",
"start_Status", "end_Status",
"start_time", "end_time",
"plan_duration", "passholder_type",
"start_Go_live_date", "end_Go_live_date",
"start_latitude", "start_longitude",
"end_latitude", "end_longitude")]
head(bikes_full_ordered)
Now that we’ve got the stations taken care of, we can start looking at the times.
bikes<- bikes_full_ordered
# Convert the trip timestamps (day/month/year with a 12-hour clock) from text
# to POSIXct in the PST8PDT time zone.
for (time_col in c("start_time", "end_time")) {
  parsed <- strptime(bikes[[time_col]], "%d/%m/%Y %I:%M:%S %p", tz = 'PST8PDT')
  bikes[[time_col]] <- as.POSIXct.POSIXlt(parsed)
}
# Bikes Data
# Same conversion for the station go-live dates (month/day/year).
# NOTE(review): the stations added by hand earlier use dash-separated dates,
# which this format cannot parse (they would become NA) - confirm.
for (date_col in c("start_Go_live_date", "end_Go_live_date")) {
  parsed <- strptime(bikes[[date_col]], format = "%m/%d/%Y", tz = 'PST8PDT')
  bikes[[date_col]] <- as.POSIXct.POSIXlt(parsed)
}
First, We’ll take a look at the duration. The difftime gives minutes
# Trip duration in minutes.
# Subtracting two POSIXct vectors lets R pick the unit automatically from the
# smallest absolute difference, so the numbers could silently become seconds
# or hours on other data. Ask for minutes explicitly so that as.numeric()
# always yields minutes, matching the later recomputations in this file.
# Rows with a missing start or end time give NA; they are ignored for now.
duration <- difftime(time1 = bikes$end_time, time2 = bikes$start_time, units = "mins")
duration <- as.numeric(duration) # minutes
summary(duration)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-116.00000 7.00000 12.00000 38.33932 24.00000 8849.00000
# Count trips whose end time is not after the start time. Summing the logical
# mask with na.rm = TRUE keeps trips with a missing duration out of the count;
# indexing with the raw mask would have counted every NA as a hit.
sum(duration <= 0, na.rm = TRUE) # there's 7 values where the difference is <= 0
[1] 7
# Show the non-positive durations. which() drops NA comparisons, so missing
# durations are not printed as spurious NA entries (same indexing as the
# row lookups that follow).
duration[which(duration <= 0)]
[1] -104 -109 -106 -111 -108 -114 -116
# Rows whose end time is not after the start time.
non_positive <- which(duration <= 0)
bikes[non_positive, ]
bikes[non_positive, c("start_time", "end_time", "bike_id")]
# All of these happened on 11/5 around 1 AM or 2 AM: the daylight saving time
# change. The end time has to move forward; two hours are added (rather than
# one) because R already accounts for the time switch.
# The other time changes need to be checked as well.
temp1 <- as.POSIXct(bikes[non_positive, "end_time"] + 2 * 3600, tz = "PST8PDT")
bikes[non_positive, "end_time"] <- temp1
# Recompute the durations after the correction.
duration <- as.numeric(
  difftime(time1 = bikes$end_time, time2 = bikes$start_time, units = "mins")
) # minutes
summary(duration)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00000 7.00000 12.00000 38.34063 24.00000 8849.00000
# Ok, those are fixed.
# Now look at the other time change: 3/12/17, 2 AM -> 3 AM.
dst_format <- "%d/%m/%Y %I:%M:%S %p"
test_time1 <- strptime("12/03/2017 01:00:00 AM", format = dst_format, tz = "PST8PDT")
test_time2 <- strptime("12/03/2017 03:00:00 AM", format = dst_format, tz = "PST8PDT")
# Trips that started strictly between 1 AM and 3 AM on that day.
in_spring_window <- which(bikes$start_time > test_time1 & bikes$start_time < test_time2)
bikes[in_spring_window, c("start_time", "end_time")]
duration[in_spring_window]
[1] 3 939 14 41 908 4 17 13
# Looks like R takes care of these as long as they're coded as correct times.
# No need to worry about it.
# Now look at long trips: duration > 1440 minutes (1 day).
over_a_day <- duration > 1440
ends_at_3000 <- bikes$end_station == "3000"
bikes[which(over_a_day), ] # There's 2123 rows here. That's a lot!
bikes[which(over_a_day & ends_at_3000), ] # 1322 are where the ending station is 3000.
# I'm going to assume that these bikes weren't returned.
summary(duration[which(over_a_day & ends_at_3000)]) # we need to tag these as such somehow.
Min. 1st Qu. Median Mean 3rd Qu. Max.
1442.000 1787.500 2380.500 2578.438 2961.750 8849.000
# Long trips (> 1 day) that did not end at station 3000.
long_elsewhere <- which(duration > 1440 & bikes$end_station != "3000")
bikes[long_elsewhere, ] # 801 left
summary(duration[long_elsewhere]) # anywhere from a day to 6 days
Min. 1st Qu. Median Mean 3rd Qu. Max.
1442.000 1610.000 2013.000 2367.436 2698.000 8678.000
# Cap every trip longer than one day: set its end time to 1441 minutes after
# the start time (60*60*24 + 60 seconds), i.e. one minute more than a day.
# Need to note that in the summary.
too_long <- which(duration > 1440)
temp1 <- as.POSIXct(bikes[too_long, "start_time"] + 60 * 60 * 24 + 60, tz = "PST8PDT")
bikes[too_long, "end_time"] <- temp1
# Recompute the durations after capping.
duration <- difftime(time1 = bikes$end_time, time2 = bikes$start_time, units = "mins")
duration <- as.numeric(duration) # minutes
summary(duration)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00000 7.00000 12.00000 34.83045 24.00000 1441.00000
# Histogram of the trip durations in minutes.
hist(duration)
# TODO: revisit the zoomed-in histograms below; they failed with:
#Error in hist.default(duration, xlim = c(0, 60), breaks = seq(from = 0, : some 'x' not counted; maybe 'breaks' do not span range of 'x'
# NOTE(review): the breaks stop at 1445, so presumably some durations fell
# outside that range when this was run -- confirm before re-enabling.
#hist(duration, xlim=c(0,30), breaks=seq(from=0, to=1445, by=1))
#hist(duration, xlim=c(0,60), breaks=seq(from=0, to=1445, by=1))
It looks like most of the trips are pretty short. Let’s make a new attribute that is the number of pay periods (30-minute intervals). I’ll use the ceiling function so that any partially used 30-minute interval counts as a full pay period; for example, a 31-minute trip counts as two pay periods.
# Number of 30-minute pay periods per trip; a partially used period counts in full.
pay_periods <- ceiling(duration / 30)
hist(pay_periods) # looks the same. Let's look at a table
# Trip counts per pay period and per duration value.
pay_period_counts <- summary(factor(pay_periods))
duration_counts <- summary(factor(duration))
# Same one-period-wide bins for both zoom levels.
period_breaks <- seq(from = 0, to = 50, by = 1)
hist(pay_periods, xlim = c(0, 4), breaks = period_breaks)
hist(pay_periods, xlim = c(0, 15), breaks = period_breaks)
# Percent of pay periods
cumsum(pay_period_counts) / length(duration)
1 2 3 4 5 6 7 8 9
0.8111180926 0.9083474787 0.9435920761 0.9600788389 0.9690365216 0.9742992188 0.9795994286 0.9821971722 0.9839149341
10 11 12 13 14 15 16 17 18
0.9851903605 0.9862250815 0.9873613990 0.9880538180 0.9886477666 0.9891776313 0.9896793615 0.9900873104 0.9904639989
19 20 21 22 23 24 25 26 27
0.9907640992 0.9910610735 0.9913017790 0.9915409215 0.9917425514 0.9919785678 0.9921833238 0.9924068360 0.9925834576
28 29 30 31 32 33 34 35 36
0.9928085329 0.9930367342 0.9932618094 0.9934978258 0.9937416574 0.9939385982 0.9941558584 0.9943574883 0.9945825635
37 38 39 40 41 42 43 44 45
0.9947982607 0.9949873864 0.9951702601 0.9953421925 0.9955328813 0.9957188810 0.9958986286 0.9960846283 0.9962409306
46 47 48 49
0.9963800396 0.9965207116 0.9966817029 1.0000000000
# Share of trips in each pay-period bucket (the counts sum to the number of trips).
prop.table(pay_period_counts)
1 2 3 4 5 6 7
0.8111180926122 0.0972293860760 0.0352445974123 0.0164867627613 0.0089576827252 0.0052626972144 0.0053002097576
8 9 10 11 12 13 14
0.0025977436205 0.0017177618766 0.0012754264707 0.0010347209848 0.0011363174561 0.0006924190276 0.0005939486016
15 16 17 18 19 20 21
0.0005298646735 0.0005017302661 0.0004079489079 0.0003766884552 0.0003001003461 0.0002969743008 0.0002407054859
22 23 24 25 26 27 28
0.0002391424633 0.0002016299200 0.0002360164180 0.0002047559653 0.0002235122369 0.0001766215578 0.0002250752595
29 30 31 32 33 34 35
0.0002282013048 0.0002250752595 0.0002360164180 0.0002438315312 0.0001969408521 0.0002172601464 0.0002016299200
36 37 38 39 40 41 42
0.0002250752595 0.0002156971237 0.0001891257389 0.0001828736484 0.0001719324899 0.0001906887616 0.0001859996936
43 44 45 46 47 48 49
0.0001797476031 0.0001859996936 0.0001563022636 0.0001391090146 0.0001406720372 0.0001609913315 0.0033182970556
# Percent of duration
# summary(as.factor(duration)) keeps only the most frequent durations, orders
# them by frequency and lumps the remainder into "(Other)", so a running total
# over it is not a cumulative distribution. table() keeps every distinct
# duration in ascending order, so the cumulative sum below is the share of
# trips lasting at most that many minutes.
duration_counts <- table(duration)
cumsum(duration_counts) / length(duration)
6 5 7 8 4 9 10 11 12
0.0638447856 0.1260999772 0.1876799430 0.2430765912 0.2967445365 0.3451872970 0.3902476766 0.4302485519 0.4656525776
3 13 14 15 16 17 18 19 20
0.5006048898 0.5319856952 0.5606155808 0.5865383112 0.6098836173 0.6303310795 0.6486043771 0.6647691572 0.6798429475
2 21 22 1 23 24 25 26 27
0.6944837805 0.7082821443 0.7209801402 0.7336078001 0.7453414110 0.7564826364 0.7670064678 0.7769785522 0.7864786038
28 29 30 31 32 33 34 35 36
0.7953737656 0.8035577521 0.8111180926 0.8175890063 0.8233503078 0.8285833075 0.8335005768 0.8380989893 0.8424801418
37 38 39 40 41 1441 43 42 44
0.8466815466 0.8505672209 0.8543200383 0.8578884189 0.8612707999 0.8645890970 0.8677401506 0.8708114901 0.8738124936
46 45 48 47 49 50 51 52 54
0.8766900182 0.8795159632 0.8822309335 0.8849287105 0.8874592442 0.8899397611 0.8923577571 0.8947757531 0.8971234131
53 55 56 57 58 59 60 61 62
0.8993319641 0.9015108177 0.9036756040 0.9057794325 0.9078207401 0.9098104679 0.9116657757 0.9134569997 0.9151763246
65 63 64 66 67 68 69 71 72
0.9167518514 0.9183211261 0.9198872748 0.9213893396 0.9228132532 0.9241605787 0.9254438203 0.9267083056 0.9279696649
70 74 75 73 76 78 79 77 82
0.9291888225 0.9304048541 0.9315802471 0.9327493881 0.9338591341 0.9349407458 0.9360192314 0.9370727087 0.9381230599
81 80 85 84 86 87 88 83 89
0.9391124532 0.9400768382 0.9410052736 0.9418790033 0.9427417918 0.9435873870 0.9444314193 0.9452738885 0.9461132316
90 91 92 94 96 93 95 101 97
0.9469103732 0.9476778173 0.9484311942 0.9491642518 0.9498785531 0.9505537789 0.9512227526 0.9518792221 0.9525294395
(Other)
1.0000000000
# Empirical CDF of the trip durations: full range, then zoomed to the first hour.
duration_ecdf <- ecdf(duration)
plot(duration_ecdf, ylim = c(0, 1))
plot(duration_ecdf, xlim = c(0, 60), ylim = c(0, 1))
# Empirical CDF of the pay periods, zoomed to the first ten periods.
plot(ecdf(pay_periods), xlim = c(0, 10), ylim = c(0, 1))
# Trip counts per pay period.
summary(factor(pay_periods))
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
518942 62206 22549 10548 5731 3367 3391 1662 1099 816 662 727 443 380 339 321
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
261 241 192 190 154 153 129 151 131 143 113 144 146 144 151 156
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
126 139 129 144 138 121 117 110 122 119 115 119 100 89 90 103
49
2123
#81.1 percent of the trips last at most 30 minutes (one pay period)
# 9.7 percent of the trips last more than 30 minutes and at most 60 minutes (two pay periods)
# we can make a new attribute called short trips for trips of at most 30 minutes. This will help when determining the cost.
Let’s look at the trips where the trip date was before the station’s go-live date.
# Trips that started before the start station's go-live date.
a <- bikes[which(bikes$start_time < bikes$start_Go_live_date), ] # There's 94 in the start, all in pasadena and port of LA
# Trips that ended before the end station's go-live date.
b <- bikes[which(bikes$end_time < bikes$end_Go_live_date), ] # 93 here.
# Trips that are in a but not in b. %in% is used because setdiff() can return
# zero or several ids; == would recycle them element-wise against trip_id and
# miss matches.
bikes[bikes$trip_id %in% setdiff(a$trip_id, b$trip_id), ]
a
b
# we should just drop these from the data. They appear to be test trips.
How about trips between regions?
# Trips whose start and end regions differ, leaving out any trip that starts
# or ends at station 3000.
crosses_regions <- bikes$start_Region != bikes$end_Region
touches_3000 <- bikes$start_station == "3000" | bikes$end_station == "3000"
between_regions <- bikes[which(crosses_regions & !touches_3000), ] # 681 trips all one way (makes sense).
# Start region (rows) against end region (columns).
table(between_regions$start_Region, between_regions$end_Region) # most from dtla to venice or pasadena and venice to LA.
DTLA Pasadena Port of LA Venice
DTLA 0 40 5 307
Pasadena 138 0 0 0
Port of LA 9 0 0 2
Venice 169 1 10 0
# Durations (minutes) of the trips between regions, again leaving out station 3000.
cross_region_idx <- with(
  bikes,
  which(start_Region != end_Region & start_station != "3000" & end_station != "3000")
)
duration[cross_region_idx]
[1] 63 68 105 110 485 167 82 486 46 107 133 81 111 108 41 57 67 135 59 167 129 72
[23] 66 53 101 54 58 27 41 192 136 51 70 173 82 60 172 59 129 82 184 543 529 278
[45] 279 149 88 91 196 132 132 82 89 106 152 133 67 150 71 87 200 116 392 391 105 104
[67] 290 93 128 82 1324 339 622 1317 289 291 336 1405 79 155 403 626 108 338 131 1381 95 225
[89] 97 1403 111 127 113 137 284 340 1441 241 128 128 111 189 136 122 322 149 90 93 125 29
[111] 43 180 128 55 231 40 131 32 112 112 74 141 141 283 96 144 97 144 77 1122 97 320
[133] 58 75 94 467 81 179 111 114 116 344 139 134 337 82 79 158 157 138 137 70 93 126
[155] 193 178 230 1441 210 1074 26 273 1286 183 1296 185 1169 88 1159 311 42 184 20 12 375 184
[177] 186 182 1064 182 595 970 1253 182 19 214 348 377 310 29 444 1204 185 185 380 31 12 188
[199] 321 216 52 256 865 887 182 217 458 9 20 442 834 1121 1122 808 182 185 239 122 190 182
[221] 1374 150 13 184 382 13 1015 1272 188 518 360 191 182 434 1123 101 192 439 84 430 192 182
[243] 11 182 222 191 28 353 192 127 182 191 824 1207 184 825 9 187 183 356 133 198 96 95
[265] 163 165 88 166 183 179 314 128 130 181 225 94 64 55 130 49 144 46 54 415 110 73
[287] 92 41 343 342 55 52 50 61 79 71 136 129 142 137 140 213 285 136 87 107 104 211
[309] 141 103 172 85 77 124 174 359 143 133 340 139 123 163 142 136 88 115 141 408 334 89
[331] 128 1009 112 1014 614 388 610 161 122 237 66 236 162 234 96 43 146 137 71 179 178 141
[353] 141 143 141 950 950 1441 777 774 444 1441 204 1441 1441 1441 77 69 80 158 149 204 304 304
[375] 550 124 91 181 417 363 400 170 206 180 366 156 416 1069 130 273 1068 105 538 536 395 394
[397] 334 242 120 335 745 102 378 72 37 208 182 163 115 272 229 274 208 141 260 249 169 212
[419] 113 207 151 244 157 170 255 137 244 118 201 202 677 210 211 161 124 194 187 403 644 320
[441] 165 1039 215 180 95 126 274 183 179 221 181 122 108 163 140 186 1376 54 166 397 301 159
[463] 191 125 131 122 236 155 561 300 304 127 338 120 174 154 167 341 385 207 280 164 208 162
[485] 149 201 151 284 154 340 184 227 143 156 342 191 462 319 161 168 161 302 369 234 104 163
[507] 306 177 459 141 369 130 185 169 284 231 162 131 351 342 267 435 130 116 186 98 190 135
[529] 214 177 136 115 142 213 137 148 146 128 179 102 181 148 136 172 191 175 130 94 141 135
[551] 215 175 140 83 109 150 90 151 98 73 126 524 108 121 124 148 163 169 125 119 96 213
[573] 134 478 132 148 126 289 84 136 138 107 248 114 157 199 76 132 118 148 111 138 122 79
[595] 111 106 86 125 210 61 1441 124 134 142 73 140 171 142 82 141 127 120 211 49 77 140
[617] 76 141 115 150 143 211 210 90 277 277 141 273 140 275 277 122 277 110 230 235 103 161
[639] 254 551 233 142 247 143 140 105 107 108 105 152 154 124 103 174 145 149 173 159 580 171
[661] 67 115 102 103 169 1441 40 114 704 268 267 49 158 112 465 431 286 248 287 240 212
# Print every between-region trip, then save the cleaned trip table as CSV
# without row names.
between_regions[, ]
write.csv(x = bikes, file = "./data/bicycle_clean.csv", row.names = FALSE)